*** CA Tracks merging and cleaning

local trackdatafile "`1'"

local price_file "`2'"
local cpi_file "`3'"
local unem_file "`4'"

*** loading and preparing unemployment rate data *******************************
import delimited using "`unem_file'" , clear
gen date2 = date(date,"YMD")
drop date
rename date2 date
format  date %td
gen m = month(date)
gen y = year(date)
drop date
save unem, replace

*** loading and preparing fuel prices data *************************************
* get monthly CPI data 
import delimited using "`cpi_file'" , clear
gen date2 = date(date,"MDY")
drop date
rename date2 date
format  date %td
gen m = month(date)
gen y = year(date)
save cpi, replace

* load price data
import delimited using "`price_file'" , clear
drop index* volume bid ask open open_i	mean currency units description
gen date_num = date(date, "MDY")
drop date
rename date_num date
format date %td

* merging in CPI
gen m = month(date)
gen y = year(date)
merge m:m y m using "cpi" , gen(_mcpi)
drop if _mcpi~=3
drop _mcpi

* get base year cpi
sum cpiaucsl if y==2011 & m==1
scalar cpi_base = r(mean)

* deflator
gen cpi_def = cpiaucsl / cpi_base

* rename 
gen low_nom = low
gen high_nom = high
gen close_nom = close

* deflate
replace low = low/cpi_def
replace high = high/cpi_def
replace close = close/cpi_def

gen type = "IFO380"
replace type = "MGO" if symbol=="AAWYR00"
replace type = "MDO" if symbol=="POACP00"

drop symbol

reshape wide low* high* close* , j(type) i(date) string

egen IFO = rmean(lowIFO380  highIFO380) // mean of IFO high and low
egen MDO = rmean(lowMDO  highMDO) // mean of MDO high and low
egen MGO = rmean(lowMGO  highMGO) // mean of MGO high and low

* MDO and MGO overlap for about 4 months (sept 2012 to dec 2012) prices are identical
gen LS = MDO
replace LS = MGO if MDO==.

* this substracts dow from date ind
* therefor it gives an indicator for day 0 of the week of each observation
gen week_ind = date - dow(date)
format week_ind %td

collapse IFO MDO MGO LS , by(week_ind)

* collapsing to weekly averages
*** only have work week observations
*** not clear when boats actually fuel

gen LSprem  = LS - IFO
gen LSprem_perc = LSprem/IFO
label var LSprem "LS Premium"


sum LSprem_perc if week_ind>=mdy(1,1,2008) & week_ind<=mdy(1,1,2016)
sum LSprem_perc if week_ind>=mdy(7,1,2009) & week_ind<=mdy(8,1,2012)

* plots
local policy_date1 = mdy(7,1,2009)
local policy_date1_start = `policy_date1'-150
local policy_date1_end = `policy_date1'+150 

local policy_date2 = mdy(12,1,2011)
local policy_date2_start = `policy_date2'-150
local policy_date2_end = `policy_date2'+150 

twoway (line IFO week_ind ) (line LS week_ind , lpattern(shortdash) )   if week_ind>=mdy(1,1,2009) & week_ind<=mdy(8,1,2012) , ///
name(fuel_prices, replace) ytitle("$/ton") xtitle("") xlab(#6) ///
legend( label(1 "Residual") label(2 "Distillate") ) ylab( ,nogrid)  ///
xline(`policy_date1' `policy_date2' , lcolor(black)  ) /// lpattern(dash)
xline(`policy_date1_start' `policy_date1_end' `policy_date2_start' `policy_date2_end' , lcolor(gray) lpattern(dash) ) 
graph export "sum_stats/fuel_prices.eps", replace
graph drop fuel_prices

sum IFO LS LSprem_perc if week_ind>=mdy(7,1,2009) & week_ind<=mdy(8,1,2012) // CA ECA active
sum IFO LS LSprem_perc if week_ind>=mdy(1,1,2015) & week_ind<=mdy(1,1,2017) // NA ECA 

*twoway (line LSprem week_ind)    if week_ind>=mdy(1,1,2008)
save "fuelPrices", replace


********************************************************************************
* loading track data
import delimited using "`trackdatafile'" , clear asfloat
drop m y
*destring aisyear dist , replace

* this is full track id, so not useful (dropped in updated csvs)
capture drop interp_track_id

* adding observation id
*gen track_id = _n

* recoding missing values
mvdecode * , mv(-9999)

* cleaning length and width (0 is missing)
mvdecode length width , mv(0)

* encoding group
encode group, gen(group_i) 
label list group_i

encode group_agg, gen(group_agg_i) 
label list group_agg_i

* cleaning imo number
* destring if it is a string
capture confirm string var imo
if _rc==0 {
	replace imo="" if imo=="-9999"
	replace imo = subinstr(imo , "IMO" , "" , . ) 
	destring imo , replace
}

* cleaning up any imo numbers with more than 7 digits (with 0 at end)
* clearing last digit(s) if zero
gen imo_div = imo/1e7
replace imo = . if imo_div < 0.1 // removes any imo numbers that don't have 7 digits

replace imo = imo/10 if imo_div>1 & imo_div<10 & mod(imo,10)==0
replace imo = imo/100 if imo_div>10 & imo_div<100 & mod(imo,100)==0

replace imo_div = imo/1e7
replace imo = . if imo_div > 1 // removes any imo numbers that have more than 7

drop imo_div

* renaming
*rename shape_length dist
rename time_eca2009 hours_eca2009
rename time_eca2011 hours_eca2011

rename portcode_sn1 portSN1 
rename portcode_sn2 portSN2 

* dealing with dates/times
split time1 , p(" ")
split time2 , p(" ")

gen date1 = date(time11, "YMD")
gen date2 = date(time21, "YMD")
format date1 %td
format date2 %td

gen clock1 = clock(substr(time12,1,8),"hms")
gen clock2 = clock(substr(time22,1,8),"hms")
format clock1 %tcHH:MM:SS
format clock2 %tcHH:MM:SS

generate dt1 = dhms(date1,hh(clock1),mm(clock1),ss(clock1))
generate dt2 = dhms(date2,hh(clock2),mm(clock2),ss(clock2))
format dt1 %tcNN/DD/CCYY_HH:MM:SS
format dt2 %tcNN/DD/CCYY_HH:MM:SS

drop time1 time2 time12 time21 time11 time22

* generating year, month, and month*year indicators
* rule for date is 
* 1) use earliest date
* 2) unless earlier date is 
gen WC1 = port1<=12 //
gen WC2 = port2<=12 

gen date = date1
replace date = date2 if (WC1==0 & WC2==1)

gen dt = dt1
replace dt = dt2 if (WC1==0 & WC2==1) 
format dt %tcNN/DD/CCYY_HH:MM:SS

gen m = month(date)
gen y = year(date)
gen dow1 = dow(date)
gen q = quarter(date)
gen my = ym(y,m)
format my %tm 
gen qy = yq(y,q) // year by quarter
format qy %tq
gen t = date - mdy(1,1,2009)
gen t2 = t*t

* generate week identifier to merge in fuel prices
gen week_ind = date - dow(date)
format week_ind %td

gen smy = string(my,"%tm")
labmask my , val(smy)
drop smy

gen mmsi_scrambled = (aisyear>=2010) & (aisyear<=2014)

* generating unique vessel ids
* using IMO if available
* otherwise use mmsi, but accounting for scrambling
egen uniq_mmsi = group(mmsi mmsi_scrambled)
gen ves_id_tmp = imo
replace ves_id_tmp = uniq_mmsi/1000000 if ves_id_tmp==.
egen ves_id = group(ves_id_tmp)

rename vesseltype vesselType_detailed
encode vessel_str, gen(vesseltype) 
label list vesseltype
*drop vessel_str

* generating a hazardous category variable
* only really useful for 2009-until updated group numbers are used (4 digit ones)
gen hazard_cat = 0 
replace hazard_cat = 1  if inlist(vesselType_detailed, 21, 41, 61, 71, 81, 91)
replace hazard_cat = 2  if inlist(vesselType_detailed, 22, 42, 62, 72, 82, 92)
replace hazard_cat = 3  if inlist(vesselType_detailed, 23, 43, 63, 73, 83, 93)
replace hazard_cat = 4  if inlist(vesselType_detailed, 24, 44, 64, 74, 84, 94)
label var hazard_cat "Hazardous Category"
label define hazard_cat 0 "None" 1 "Major Hazard" 2 "Hazard" 3 "Minor Hazard" 4 "Recog Hazard"
label values hazard_cat hazard_cat

* https://help.marinetraffic.com/hc/en-us/articles/205579997-What-is-the-significance-of-the-AIS-Shiptype-number-
*category A = IMO category X - major hazard (no discharge)
*category B = IMO category Y - hazard (limit discharge)
*category C = IMO category Z - minor hazard (less stringent limits)
*category D = IMO category OS - recognisable hazard

gen hazard_ind = (hazard_cat>0) 
label var hazard_ind "Hazardous"


* flag for vessels coming from historic wfr
rename from_hist hist_wfr_only  

/*
tab hist_wfr_only y
distinct mmsi if hist_wfr_only==1
tab hist_wfr_only y if merged_chars==1
*/

***** WRITE SUMMARY STATS ******************************************************
egen ves_y_tag = tag(ves_id y) // sums to total vessel by year
egen ves_tag = tag(ves_id)    // sums to total vessel
gen ves_y_tag_noimo = ves_y_tag*(~has_imo) // sums to total vessels by year without imo
gen ves_y_tag_wfr = ves_y_tag*(merged_chars) // sums to total vessels by year with WFR
gen ves_y_tag_imo_nowfr = ves_y_tag*(~merged_chars)*has_imo // sums to total vessels by year with imo but not in wfr
gen merged_chars100 = merged_chars*100

/*
preserve
keep if has_imo & ~merged_chars
keep imo
duplicates drop imo, force 
export delimited using "C:\Users\rklotz\OneDrive\Work\Ports_ECAs\marine_traffic_scrape\imos_to_scrape.csv"
restore
*/

***** summary stats for data creation 
* Tracks, vessels, etc over time

tabout y using sum_stats/merge_summary_stats.tex if vesseltype=="Cargo":vesseltype & interp_flag==0 , oneway replace sum ///
	cells(N dist mean merged_chars100 sum ves_y_tag sum ves_y_tag_wfr sum ves_y_tag_noimo sum ves_y_tag_imo_nowfr) ///
	format( 0c p1 0c 0c 0c 0c  ) ptotal(none) h2( \textbf{Cargoships}\\  & Tracks & Tracks 	& Vessels & Vessels   & Vessels 	& Vessels \\ ) /// 
											  h3( &        & w. chars.  &  		  & w. chars. &  no IMO 	& no WFR  \\ ) ///
	style(tex) bt topf($top_tex) 

tabout y using sum_stats/merge_summary_stats.tex if vesseltype=="Tanker":vesseltype & interp_flag==0, oneway append sum ///
	cells(N dist mean merged_chars100 sum ves_y_tag sum ves_y_tag_wfr sum ves_y_tag_noimo sum ves_y_tag_imo_nowfr) ///
	format( 0c p1 0c 0c 0c 0c  ) ptotal(none) h2(\\ \textbf{Tankers} \\ )  h3(nil) ///
	style(tex) bt botf($bot_tex) 
	
drop ves_y_tag ves_tag ves_y_tag_noimo ves_y_tag_wfr ves_y_tag_imo_nowfr merged_chars100
********************************************************************************

* clean up length and width for vessels without WFR data
rename length length_ais
rename width width_ais

* use wfr data for length and width if have data from WFR
gen length = loa if merged_chars==1
gen width = beam if merged_chars==1

* throw error if any WFR vessels missing length and width
* originally there were none
sum length if length==. & merged_chars==1
if r(N)>0 {
	error
}

* if only in AIS then take modal length and width
replace length_ais=. if length_ais>400 | length_ais<70
replace width_ais=. if width_ais>60 | width_ais<10
*egen sd_length = sd(length_ais) if merged_chars==0, by(ves_id)
egen mode_length = mode(length_ais) if merged_chars==0, by(ves_id) maxmode
egen mode_width = mode(width_ais) if merged_chars==0, by(ves_id) maxmode
replace length = mode_length if merged_chars==0 
replace width = mode_width if merged_chars==0 

drop mode_width mode_length 

* clean AIS type (might not be used) *******************************************
* count number of vessel types (none other or NA) for each vessel id
gen type_noNA = vesseltype if ~(vesseltype=="NA":vesseltype)
egen tag = tag(ves_id type_noNA) // tag each ves_id and type
egen types = total(tag), by(ves_id) // count number of tags

* if types>0 - 1 or more non-NA type
* use modal type
egen modal_type = mode(vesseltype) , minmode by(ves_id)
gen type2 = modal_type if types>0
replace type2 = "NA":vesseltype if types==0
label value type2 vesseltype

rename vesseltype vesseltype_ais
rename type2 vesseltype

drop types modal_type tag type_noNA

* vessel variables
rename powertype power_str
encode power_str, gen(powertype) 
label list powertype

rename mainenginefueltype fuel_str
encode fuel_str, gen(fueltype) 
label list fueltype

* generating vessel power in horsepower
gen power_hp = ( power_kw*1.34102 ) / 1000
label var power_hp "Power (1000 HP)"


* getting marine identification digits and US flagged ships ********************
*gen USflag = inlist(mid, 338, 366, 367, 368, 369)
gen USflag = inlist(flag_country_mid , "United States of America" )
label var USflag "US Flag"

* calculating duration and avg speed
gen hours = (dt2-dt1)/(60000*60)
gen avg_speed = dist/hours
*gen speed_kmh_imp = dist/hours

* calculating distance/time/speed out of ECA
** removing negative distances, b/c these are all very small (1/10 of km)
** also removing negative times, b/c these were very small
** these tracks are effectively always out of ECA, just geoprocessing or rounding issue

gen dist_out2009 = max( 0 , dist - dist_eca2009 ) 
gen dist_out2011 = max( 0 , dist - dist_eca2011 ) 

gen hours_noeca2009 = max( 0, hours - hours_eca2009 ) 
gen hours_noeca2011 = max( 0, hours - hours_eca2011 ) 

gen noeca2009_ind = (dist_out2009<2) | (hours_noeca2009<1) 
gen noeca2011_ind = (dist_out2011<2) | (hours_noeca2011<1) 

*replace dist_eca2009=. if date1>= mdy(2,1,2012)
*replace dist_eca2011=. if date1<= mdy(8,1,2011)

*replace dist_out2009=. if date1>= mdy(2,1,2012)
*replace dist_out2011=. if date1<= mdy(8,1,2011)

gen avg_speed_eca2009 = dist_eca2009/hours_eca2009
gen avg_speed_eca2011 = dist_eca2011/hours_eca2011 

* only generating speed for distances greater than a kilometer
gen avg_speed_noeca2009 = dist_out2009/hours_noeca2009 if noeca2009_ind~=1
gen avg_speed_noeca2011 = dist_out2011/hours_noeca2011 if noeca2011_ind~=1

gen speed_diff_eca2009 = avg_speed_noeca2009 - avg_speed_eca2009 
gen speed_diff_eca2011 = avg_speed_noeca2011 - avg_speed_eca2011 


* distance in 1000 km
gen dist1000 = dist/1000 	
gen dist_eca2009_1000 = dist_eca2009/1000
gen dist_eca2011_1000 = dist_eca2011/1000
gen dist_out2009_1000 = dist_out2009/1000
gen dist_out2011_1000 = dist_out2011/1000

* getting fuel consumption per km (t/km)
gen tkm_cons = f_cons / dist 
gen tkm_power = f_power / dist 
gen tkm_eca09_cons = f_eca09_cons / dist_eca2009 
gen tkm_eca09_power = f_eca09_power / dist_eca2009
gen tkm_eca11_cons = f_eca11_cons / dist_eca2011 
gen tkm_eca11_power = f_eca11_power / dist_eca2011

gen tkm_out09_cons = f_out09_cons / dist_out2009 
gen tkm_out09_power = f_out09_power / dist_out2009
gen tkm_out11_cons = f_out11_cons / dist_out2011 
gen tkm_out11_power = f_out11_power / dist_out2011


* labeling
label variable dist "Distance (km)"
label variable dist_eca2009 "2009 ECA"
label variable dist_eca2011 "2011 ECA"
label variable dist_eca2009_1000 "2009 ECA"
label variable dist_eca2011_1000 "2011 ECA"
label variable dist_out2009_1000 "2009 ECA"
label variable dist_out2011_1000 "2011 ECA"

label variable avg_speed "Speed (km/h)"
label variable avg_speed_eca2009 "In 2009 ECA"
label variable avg_speed_eca2011 "In 2011 ECA"
label variable avg_speed_noeca2009 "Out 2009 ECA"
label variable avg_speed_noeca2011 "Out 2011 ECA"

label variable length "Length (m)"
label variable width "Width (m)"
 
label var length_ais "Length (m)"
label var width_ais "Width (m)"


* merging in fuel prices *******************************************************
merge m:1 week_ind using "fuelPrices" , gen(_mprices)
drop if _mprices==2
drop _mprices

* merging in unemployment rate**************************************************
merge m:1 m y using "unem" , gen(_munem)
drop if _munem==2
drop _munem

* adding value labels to port
label define portSN 1	"Hilo" ///
2	"Kahalui" ///
3	"Nawil" ///
4	"Honolulu" ///
5	"PHarbor" ///
6	"BPoint" ///
7	"Ensen" ///
8	"San Diego" ///
9	"LA/LB" ///
11	"Hueneme" ///
12	"San Francisco" ///
13	"HB" ///
14	"CB" ///
15	"Portland" ///
16	"GH" ///
17	"Seattle" ///
18	"Alb" ///
19	"PWSound" ///
20	"CookIn" ///
21	"Unimak Pass"
						 
label values portSN1 portSN
label values portSN2 portSN


encode route_name, gen(route_id) 
label list route_id

encode route_name_interp, gen(route_id_interp) 
label list route_id_interp


* routes - for broader aggregation of ports
egen port_agg1_id = group(port_agg1) , label
egen port_agg2_id = group(port_agg2) , label
rowsort port_agg1_id port_agg2_id, gen(svar1 svar2)
egen route_agg = group(svar*) , label
drop svar*


* direction of exits
*gen south = port_agg1=="USbound_S" | port_agg2=="USbound_S"
gen south = port_name1=="US1" | port_name2=="US1"
gen south_broad = inlist(port_name1,"US1","US2") | inlist(port_name2,"US1","US2")
gen south_broad1_3 = inlist(port_name1,"US1","US2","US3") | inlist(port_name2,"US1","US2","US3")
gen west = inlist(port_name1,"US3","US4","US5","US6") | inlist(port_name2,"US3","US4","US5","US6")
gen north = inlist(port_name1,"US7","US8","US9") | inlist(port_name2,"US7","US8","US9")


*** Classifying tracks *************
* indicators for port fixed effects (unused??)
gen port1_fe = port1 if port1~=.
replace port1_fe = 0 if port1_fe>100 & port1_fe~=.
gen port2_fe = port2 if port2~=.
replace port2_fe = 0 if port2_fe>100 & port2_fe~=.

* flag for route to AK or HI
gen HIport = inlist(port_agg1,"HI") | inlist(port_agg2,"HI") 
gen AKport = inlist(port_agg1,"AK","Unimak") | inlist(port_agg2,"AK","Unimak") 
gen otherAK = inlist(port1,15,16) | inlist(port2,15,16)
gen HItoHI = inlist(port_agg1,"HI") & inlist(port_agg2,"HI") 
gen AKtoAK = inlist(port_agg1,"AK","Unimak") & inlist(port_agg2,"AK","Unimak") 

* dummy for whether route moves between two ports
gen port2port = (port1<100 & port2<100) 
gen any_port = (port1<100 | port2<100)

* indicator for offshore route
gen offshore = inlist(route_name,"offECAS_offECAN","EguadS_EguadN","WguadS_WguadN","BajaS_BajaN","BajaN_US1","WguadN_offECAS")

gen WCport = (port1<=12 | port2<=12) // all west coast ports (minus ensenada)

* determine if track is to/from CA port
local CAports "1,2,3,4,5,12"
gen CAport = inlist(port1,`CAports') | inlist(port2,`CAports')
gen port2port_CA = inlist(port1,`CAports') & inlist(port2,`CAports') 
gen CAport1 = inlist(port1,`CAports') 

* does vessel go to southern california Port (SD,LALB,Hue)
local SOCALport "2,3,4,5"
gen SOCALport = inlist(port1,`SOCALport') | inlist(port2,`SOCALport')

* determine if track is to/from a major port
local MAJORports "1,3,4,6,7,14,17"
gen MAJORport = inlist(port1,`MAJORports') | inlist(port2,`MAJORports')
gen port2port_MAJOR = inlist(port1,`MAJORports') & inlist(port2,`MAJORports') // if track ONLY moves between major ports

* Routes to LA/LB
local LALBports "3,4"
gen LALB = inlist(port1,`LALBports') | inlist(port2,`LALBports')

* Other Ports
gen SFBay = inlist(port1,1) | inlist(port2,1)
gen SD = inlist(port1,2) | inlist(port2,2)
gen HUE = inlist(port1,5) | inlist(port2,5)
gen SEA_POR = inlist(port1,6,7) | inlist(port2,6,7)
gen POR = inlist(port1,6) | inlist(port2,6)
gen SEA = inlist(port1,7) | inlist(port2,7)

* flag if connecting to a port without container terminal
gen othercargo_port = inlist(port1,2,5,9,10,11,12) | inlist(port2,2,5,9,10,11,12)

* classifying all tracks to LA/LB, other CA, other WC
local CAports "1,2,3,4,5,12"
local SOCALport "2,3,4,5"
gen port_agg = "SoCal" if inlist(port1,`SOCALport') | inlist(port2,`SOCALport') // any track to southern california
replace port_agg = "NoCal" if port_agg=="" & ( inlist(port1,`CAports') | inlist(port2,`CAports') ) // any track to california, but not SoCal
replace port_agg = "OtherWC" if port_agg=="" & ( port1<12  | port2<12 )

gen socal_socal = ( inlist(port1,`SOCALport') & inlist(port2,`SOCALport') ) 

*capture drop port_agg_i
gen port_agg_i = 1 if port_agg=="SoCal"
replace port_agg_i = 2 if port_agg=="NoCal"
replace port_agg_i = 3 if port_agg=="OtherWC"

capture label drop port_agg_i
label define port_agg_i 1 "So. Cal" ///
2	"No. Cal" ///
3	"Other WC" 
label values port_agg_i port_agg_i


* update route type
* coastal
* enter-exits
* interpolated (long)
	* flag interpolated that were part of enter-exits
	* might have some that are not included in enter-exits

* route type 
* "coastal" is west coast to west coast port
* "EnterExit" is study area boundary to west coast
* "LongInterp" is connection from west coast to AK or HI - which are subset of EnterExit
gen route_type = "Coastal" if port2port==1 & ~(HIport==1 | AKport==1)
replace route_type = "EnterExit" if WCport==1 & (port1>100 | port2>100)
replace route_type = "LongInterp" if ( port2port==1 & (HIport==1 | AKport==1) ) & AKtoAK==0 & HItoHI==0

gen route_type_i = 1 if route_type=="Coastal"
replace route_type_i = 2 if route_type=="EnterExit"
replace route_type_i = 3 if route_type=="LongInterp"
capture label drop route_type_i
label define route_type_i 1 "Port-to-Port" ///
2	"Ent/Exit" ///
3	"Long Interp" 
label values route_type_i route_type_i

capture drop enterexit
gen enterexit = (route_type=="EnterExit")
gen porttoport = (route_type=="Coastal")


* generating duplicate flag - indicates whether track is in dataset twice
* also used to restrict sample for interpolated tracks analysis
* tracks show up twice because one is interpolated and one is partial entrance/exit
egen dup_tracks = count(dist) , by ( ves_id dt )
drop if dup_tracks>2
gen dup_tracks_flag = (dup_tracks==2)


* determing if track through SB channel and other waypoints
replace port_id = subinstr(port_id,"130","9999",.) // so no clash with 13

gen LBind = (port1==4) | (port2==4) // creating indicator for LB as opposed to LA
gen SBind = (strpos(port_id, "51")>0)
gen SF_TSS_S = (strpos(port_id, "52")>0)
gen SF_TSS_W = (strpos(port_id, "53")>0)
gen SF_TSS_N = (strpos(port_id, "54")>0)
gen LA_TSS_S = (strpos(port_id, "55")>0)
gen LA_TSS_W = (strpos(port_id, "56")>0)

* crude terminals and other intermediate ports
replace port_id = subinstr(port_id,"130","9999",.)
* updating this - includes both methods for flagging these stops
* elsegundo, rosarito are based on grid, port_id based on track creation
gen crude_term = (elsegundo==1) | (rosarito==1) | (strpos(port_id, "57")>0) | (strpos(port_id, "58")>0) // stops at elsegundo or rosarito
gen inter_port_ind = (crude_term==1) | (strpos(port_id, "13")>0) | (ensenada==1) 
replace port_id = subinstr(port_id,"9999","130",.)

label var SBind "SB Channel"

label define sf_tss_w 0 "Other" 1 "TSS W"
label values SF_TSS_W sf_tss_w

gen elsegundo_ind = (strpos(port_id, "57")>0)
gen rosarito_ind = (strpos(port_id, "58")>0)

gen lasf = (route_name=="LALB_SFBay")

* whether a 100nm boundary is in track's port list - can use to identify vessels moving much farther from coast.
gen leaves100nm = (strpos(port_id, "101")>0) | (strpos(port_id, "102")>0) | (strpos(port_id, "103")>0) | ///
				(strpos(port_id, "104")>0) | (strpos(port_id, "105")>0) | (strpos(port_id, "106")>0) | ///
				(strpos(port_id, "107")>0) | (strpos(port_id, "108")>0) | (strpos(port_id, "109")>0) 

* trips that start and end at same port 
gen samePort = (portSN1==portSN2)

* starting ending dates for port slowdown
local portslowStart = mdy(3,1,2014) // Trying to be cautious (starting congestion at LA/LB much sooner
*local portslowStart = mdy(10,1,2014) // contract expires
*local portslowEnd =   mdy(2,20,2015) // contract resolved
local portslowEnd =   mdy(5,20,2015) // allow 3 months extra to resolve backlog

gen portslow = (date>=`portslowStart') & (date<=`portslowEnd')

*** ship size bins 
egen length_bin = cut(length), at(50(50)400)

   
* route by year indicators
egen ry = group(route_id y)

* Generating some other vessel characteristics
gen twostroke = (powertype=="Diesel 2-Stroke":powertype)
label variable twostroke "2-Stroke Diesel"
gen steam  = (powertype=="Steam Turbine":powertype)
label variable steam "Steam"

gen old = (built<=1999) if built~=.			   
gen FCC = (group=="FCC") if group~=""	
label variable FCC "Container"
label define FCC 0 "Other" 1 "FCC"
label values FCC FCC

* creating indicator for vessel type used in regressions
gen vesseltype_reg = 1 if group_agg_i=="FCC":group_agg_i 
replace vesseltype_reg = 2 if (group_agg_i=="Bulker":group_agg_i) | (group_agg_i=="Other Cargo":group_agg_i)
replace vesseltype_reg = 3 if (group_agg_i=="Tankers":group_agg_i) 
replace vesseltype_reg = 4 if (group_agg_i=="Passenger":group_agg_i) 
label define vesseltype_reg 1 "Container" 2 "Other Cargo" 3 "Tanker"  4 "Passenger" 
label values vesseltype_reg vesseltype_reg
decode vesseltype_reg , gen(vesseltype_regstr)

gen other_cargo = (vesseltype_reg=="Other Cargo":vesseltype_reg) & vesseltype_reg~=.
label variable other_cargo "Other Cargo"

gen tanker = (vesseltype_reg=="Tanker":vesseltype_reg) & vesseltype_reg~=.
label variable tanker "Tankers"


* getting indicator for exempt vessels for CA ECA
* >=400ft (121 meters)
* >= 10,000 GT
* >= 30 liter per cylinder displacement
* not steam 
*gen exempt = ( (steam==1) | ~( length>=121 | gt>=10000 ) ) if steam~=. & length~=. & gt~=. 
*gen exempt_v2 = ( (steam==1) | ~( length>=121 | gt>=10000 | enginedispl_cul>=30 ) ) if steam~=. & length~=. & gt~=. 
gen exempt = (steam==1) | ( length<121 & gt<10000 & enginedispl_cul<30 )  if steam~=. & length~=. & gt~=. 


label variable exempt "Exempt (CA)"

gen treat = (exempt~=1)
label variable treat "Treated (CA)"


**** Policy Dates and Changes
* Policy dates *****************************************************************
scalar ecaCA1_scal = mdy(7,1,2009) /* CA ECA put in place */
scalar ecaCA2_scal = mdy(12,1,2011) /* CA ECA boundary change */
scalar ecaCA3_scal = mdy(1,1,2014) /* CA ECA stringency (.1%) */
scalar ecaNA1_scal = mdy(8,1,2012) /* NA ECA (1%), CA stringency change (1%) */
scalar ecaNA2_scal = mdy(1,1,2015) /* NA ECA stringency (0.1%)*/

* policy dummies
gen ecaCA1 = (date>=ecaCA1_scal)
gen ecaCA2 = (date>=ecaCA2_scal) 
gen ecaCA3 = (date>=ecaCA3_scal) 
gen ecaNA1 = (date>=ecaNA1_scal)
gen ecaNA2 = (date>=ecaNA2_scal)

* eca month indcators (for month plots)
scalar startmonth = ym( year(2009),month(1) )
scalar ecaCA1month = ym( year(ecaCA1_scal),month(ecaCA1_scal) )
scalar ecaCA2month = ym( year(ecaCA2_scal),month(ecaCA2_scal) )
scalar ecaCA3month = ym( year(ecaCA3_scal),month(ecaCA3_scal) )
scalar ecaNA1month = ym( year(ecaNA1_scal),month(ecaNA1_scal) )
scalar ecaNA2month = ym( year(ecaNA2_scal),month(ecaNA2_scal) )

* eca quarter indcators (for month plots)
scalar ecaCA1quarter = yq( year(ecaCA1_scal),quarter(ecaCA1_scal) )
scalar ecaCA2quarter = yq( year(ecaCA2_scal),quarter(ecaCA2_scal) )
scalar ecaCA3quarter = yq( year(ecaCA3_scal),quarter(ecaCA3_scal) )
scalar ecaNA1quarter = yq( year(ecaNA1_scal),quarter(ecaNA1_scal) )
scalar ecaNA2quarter = yq( year(ecaNA2_scal),quarter(ecaNA2_scal) )

* this is indicator for different periods of ECA
gen eca_ind = 0
replace eca_ind = 1 if ecaCA1==1 & ecaCA2==0 
replace eca_ind = 2 if ecaCA2==1 & ecaNA1==0 
replace eca_ind = 3 if ecaNA1==1 & ecaCA3==0
replace eca_ind = 4 if ecaCA3==1 & ecaNA2==0
replace eca_ind = 5 if ecaNA2==1 
 
disp ecaCA2month - ecaCA1month
disp ecaNA1month - ecaCA2month
disp ecaCA3month - ecaNA1month
disp ecaNA2month - ecaCA3month 

label define lab_eca_ind 0 "No ECA" 1 "CA (1.5%), 2009" 2 "CA (1.5%), 2011" ///
				   3 "CA(1%) NA(1%)" 4 "CA(0.1%) NA(1%)" 5 "CA(0.1%) NA(0.1%)" 
label define lab_eca_ind_shrt 0 "No ECA" 1 "CA 2009" 2 "CA 2011" ///
				   3 "CA+NA(1%)" 4 "CA(0.1%)+NA" 5 "CA+NA(0.1%)" 
label define lab_eca_ind_ca 0 "No ECA" 1 "CA 2009" 2 "CA 2011" ///
				   3 "CA 1%" 4 "CA 0.1%" 5 "CA 0.1%" 
label define lab_eca_ind_na 0 "" 1 "" 2 "" ///
				   3 "NA 1%" 4 "NA 1%" 5 "NA 0.1%" 
				   
* labels are CA(1%) indicating level of MGO, did this to save room and because MDO limit only changes when MGO one does
label values eca_ind lab_eca_ind

* this is indicator for different periods of ECA (collapsing 2011 ECA boundary change and 0.1% reduction in CA)
gen eca_ind4 = 0
replace eca_ind4 = 1 if ecaCA1==1 & ecaCA2==0 
replace eca_ind4 = 2 if ecaCA2==1 & ecaNA1==0
replace eca_ind4 = 3 if ecaNA1==1 & ecaNA2==0
replace eca_ind4 = 4 if ecaNA2==1 

label define lab_eca_ind4 0 "No ECA" 1 "CA (1.5%), 2009" 2 "CA (1.5%), 2011" ///
				   3 "CA NA(1%)" 4 "CA(0.1%) NA(0.1%)" 
label values eca_ind4 lab_eca_ind4
				   
* making set of dummies consistent with eca_ind
forvalues v = 1(1)5 {
	gen eca`v' = (eca_ind==`v')
	local lab : label lab_eca_ind `v'
	label var eca`v' "`lab'"
} 

* making set of dummies consistent with eca_ind
forvalues v = 1(1)4 {
	gen eca4_`v' = (eca_ind4==`v')
	local lab : label lab_eca_ind4 `v'
	label var eca4_`v' "`lab'"
} 


* TSS Changes ******************************************************************
/* TSS in Strait of Juan de Fuca codified on January 18, 2011 */
gen poldum_TSSsea = (date>=mdy(1,18,2011)) & ( inlist(port_name1,"Sea") | inlist(port_name2,"Sea") )
label variable poldum_TSSsea "TSS, Sea"
/* Change in Area to be Avoided for Olympic Coast National Marine Sanctuary December 1, 2012 */
gen poldum_ATBAoly = (date>=mdy(12,1,2012)) & ( inlist(port_name1,"Sea") | inlist(port_name2,"Sea") )
label variable poldum_ATBAoly "ATBA, OCNMS"

/* 
Adjustments SF and SB TSS on June 1, 2013;  
Affecting vessels entering LA/LB, SF, Hueneme or passing through the SB Channel
*/
gen poldum_TSSca = (date>=mdy(6,1,2013)) & ( inlist(port_name1,"SFBay","LALB","Hueneme") | inlist(port_name2,"SFBay","LALB","Hueneme") | SBind==1 )
label variable poldum_TSSca "TSS, CA"

/* Occupy Oakland Protests
Shut/slowed oakland on November 2nd 2011
Flagging those vessels that arrive in SF on 2nd and then 5 more days
*/
gen poldum_occoak = (date2>=mdy(11,2,2011)) & (date2<=mdy(11,9,2011)) & port2==1


* eca quarter indcators (for month plots)
scalar TSSca_quarter = yq( year(mdy(6,1,2013)),quarter(mdy(6,1,2013)) )
scalar TSSsea_quarter = yq( year(mdy(1,18,2011)),quarter(mdy(1,18,2011)) )
scalar ATBAoly_quarter = yq( year(mdy(12,1,2012)),quarter(mdy(12,1,2012)) )

*** Calculate Exposure to CA ECA
* calculate fraction of trip within ECA pre ECA (measure of exposure)
bys route_id vesseltype_reg: egen avg_dist_eca09_pre = mean(dist_eca2009) if my<ym(7,2009)
bys route_id vesseltype_reg: egen avg_dist_eca11_pre = mean(dist_eca2011) if my<ym(7,2009)
* replace with zero if less than 5km
* to get to SD should be 17km
replace avg_dist_eca09_pre=0 if avg_dist_eca09_pre<5
replace avg_dist_eca11_pre=0 if avg_dist_eca11_pre<5

*bys route_id vesseltype_reg: egen expos09_km = min(dist_eca2009) if my<ym(7,2009)
*bys route_id vesseltype_reg: egen expos11_km = min(dist_eca2011) if my<ym(7,2009)
bys route_id vesseltype_reg: egen avg_dist_pre = mean(dist) if my<ym(7,2009)
gen expos09_km = avg_dist_eca09_pre
gen expos11_km = avg_dist_eca11_pre
gen expos09_perc = avg_dist_eca09_pre / avg_dist_pre
gen expos11_perc = avg_dist_eca11_pre / avg_dist_pre

bys vesseltype_reg : egen mean_expos09 = mean(expos09_km) if CAport
gen expos09_km_demean = expos09_km - mean_expos09

gen exposed = (expos09_km>0) 
label define exposed 0 "Unexposed" 1 "Exposed" 
label values exposed exposed

*** Calculating average fuel prices to use in cost analysis
*** getting averages in ECA policy windows

sum IFO if inlist(eca_ind,0,1)
gen mean_IFO = r(mean) // if inlist(eca_ind,0,1)

sum LS if inlist(eca_ind,0,1)
gen mean_LS = r(mean) // if inlist(eca_ind,0,1)

***** Calculating variables that depend on compliance
* total damages if tracks are complying with fuel requirements
gen td_comply_cons = td_pre_cons
replace td_comply_cons = td_eca09_cons if ecaCA1==1 & treat	
replace td_comply_cons = td_eca11_cons if ecaCA2==1 & treat

gen td_inm_comply_cons = td_inm_pre_cons
replace td_inm_comply_cons = td_inm_eca09_cons if ecaCA1==1 & treat	
replace td_inm_comply_cons = td_inm_eca11_cons if ecaCA2==1 & treat

gen e_pm_comply_cons = e_pm_pre_cons
replace e_pm_comply_cons = e_pm_eca09_cons if ecaCA1==1 & treat	
replace e_pm_comply_cons = e_pm_eca11_cons if ecaCA2==1 & treat

gen e_so2_comply_cons = e_so2_pre_cons
replace e_so2_comply_cons = e_so2_eca09_cons if ecaCA1==1 & treat	
replace e_so2_comply_cons = e_so2_eca11_cons if ecaCA2==1 & treat

* within ECA
gen e_pm_comply_cons_ineca09 = e_pm_res_eca09_cons
replace e_pm_comply_cons_ineca09 = e_pm_dist_eca09_cons if ecaCA1>=1 & treat
gen e_so2_comply_cons_ineca09 = e_so2_res_eca09_cons
replace e_so2_comply_cons_ineca09 = e_so2_dist_eca09_cons if ecaCA1>=1 & treat

* within 2011 boundaries (accounting for overlap with 2009)
gen e_pm_comply_cons_ineca11 = e_pm_res_eca11_cons
replace e_pm_comply_cons_ineca11 = e_pm_dist_eca11and09_cons + e_pm_res_eca11not09_cons if eca_ind==1 & treat
replace e_pm_comply_cons_ineca11 = e_pm_dist_eca11_cons if eca_ind>=2 & treat

gen e_so2_comply_cons_ineca11 = e_so2_res_eca11_cons
replace e_so2_comply_cons_ineca11 = e_so2_dist_eca11and09_cons + e_so2_res_eca11not09_cons if eca_ind==1 & treat
replace e_so2_comply_cons_ineca11 = e_so2_dist_eca11_cons if eca_ind>=2 & treat

*** selecting fuel prices to use for cost calculations
local IFO_use mean_IFO
local LS_use mean_LS

gen cost_pre_cons = f_cons * `IFO_use' // fuel costs if no eca
gen cost_eca09_cons = f_eca09_cons * `LS_use' + f_out09_cons * `IFO_use' 
gen cost_eca11_cons = f_eca11_cons * `LS_use' + f_out11_cons * `IFO_use'  

gen cost_comply_cons = cost_pre_cons
replace cost_comply_cons = cost_eca09_cons if ecaCA1==1 & treat	
replace cost_comply_cons = cost_eca11_cons if ecaCA2==1 & treat

***** CONSTRUCTING BLENDS 
* 2009 sulfur limits were 1.5% for MGO and 0.5% for MDO
* with resid of 2.7 and distillate of 0.1
	* for 1% would need 0.65 distillate
	* for 1.5% would need 0.46 distillate
	* for 0.5% would need 0.85 distillate
local dist_shr 0.65
	
* weighted avg within ECA plus outside emissions
gen mean_blend = `LS_use'*`dist_shr' + `IFO_use'*(1-`dist_shr')
	
gen cost_eca09_cons_blend = f_eca09_cons * mean_blend + f_out09_cons * `IFO_use' 
gen cost_eca11_cons_blend = f_eca11_cons * mean_blend + f_out11_cons * `IFO_use'  

gen cost_comply_cons_blend = cost_pre_cons
replace cost_comply_cons_blend = cost_eca09_cons_blend if ecaCA1==1 & treat	
replace cost_comply_cons_blend = cost_eca11_cons_blend if ecaCA2==1 & treat	

gen td_inm_eca09_cons_blend = td_inm_dist_eca09_cons*`dist_shr' + td_inm_res_eca09_cons*(1-`dist_shr') + td_inm_res_out09_cons
gen td_inm_eca11_cons_blend = td_inm_dist_eca11_cons*`dist_shr' + td_inm_res_eca11_cons*(1-`dist_shr') + td_inm_res_out11_cons 

gen td_inm_comply_cons_blend = td_inm_pre_cons
replace td_inm_comply_cons_blend = td_inm_eca09_cons_blend if ecaCA1==1 & treat	
replace td_inm_comply_cons_blend = td_inm_eca11_cons_blend if ecaCA2==1 & treat	

***** Calculating difference between dist and resid
gen cost_eca09_cons_d = cost_eca09_cons - cost_pre_cons 
gen td_inm_eca09_cons_d = td_inm_eca09_cons - td_inm_pre_cons
gen e_pm_eca09_cons_d = e_pm_eca09_cons - e_pm_pre_cons

gen cost_eca09_cons_blend_d = cost_eca09_cons_blend - cost_pre_cons 
gen td_inm_eca09_cons_blend_d = td_inm_eca09_cons_blend - td_inm_pre_cons
gen td_eca09_cons_d = td_eca09_cons - td_pre_cons

gen cost_eca11_cons_d = cost_eca11_cons - cost_eca09_cons 
gen td_inm_eca11_cons_d = td_inm_eca11_cons - td_inm_eca09_cons
gen e_pm_eca11_cons_d = e_pm_eca11_cons - e_pm_eca09_cons

gen cost_eca11_cons_blend_d = cost_eca11_cons_blend - cost_eca09_cons_blend 
gen td_inm_eca11_cons_blend_d = td_inm_eca11_cons_blend - td_inm_eca09_cons_blend
gen td_eca11_cons_d = td_eca11_cons - td_eca09_cons

	
* correlated pollution variables		
gen td_corr_cons = td_nox_cons  + td_voc_cons		
gen tdtonne_corr_cons = td_corr_cons / f_cons
gen td_corr_cons_200km = td_nox_cons_200km  + td_voc_cons_200km	
gen td_corr_cons_250km = td_nox_cons_250km  + td_voc_cons_250km	
gen td_corr_cons_300km = td_nox_cons_300km  + td_voc_cons_300km	
gen td_corr_cons_350km = td_nox_cons_350km  + td_voc_cons_350km	
gen td_corr_cons_400km = td_nox_cons_400km  + td_voc_cons_400km	



* flagging interpolated tracks that are included in entrance/exits
* saving all enter/exits with interpolated track id
* merging against full dataset
* also keeping distance of enter/exit track
preserve
keep if route_type == "EnterExit" & track_id_interp~=.
keep track_id_interp dist td_inm_comply_cons cost_comply_cons 
rename track_id_interp track_id
rename dist dist_entexit
rename td_inm_comply_cons td_inm_comply_cons_entexit
rename cost_comply_cons cost_comply_cons_entexit
gen interp_in_enterexit = 1 
save enter_exits, replace
restore

merge 1:1 track_id using enter_exits 
drop _merge
replace interp_in_enterexit=0 if interp_in_enterexit==.
lab var interp_in_enterexit "=1 if interp track in enter/exit"

* flagging enter/exits that have an interpolated track
gen enterexit_in_interp = (route_type=="EnterExit") & track_id_interp~=.
lab var enterexit_in_interp "=1 if enter/exit is interpolated"

*** Entrance/Clearance Ports 
* groupings
forvalues i = 1(1)2 {
	*capture drop port_name`i'_ec
	gen port_name`i'_ec = port_name`i'_ecport 
	replace port_name`i'_ec = "South" ///
			if ( inlist(port_name`i'_ecregion,"CAm","SAm","US_EC","Af","Eur") ) | /// destination with south if CAm, SAm, Europe, US East Coast
			( inlist(port_name`i'_ecregion,"NAm") & ~inlist(port_name`i'_eccountry,"Canada, WC","US_WC") ) //  destination with south if Mexico, or Canadian EC
	replace port_name`i'_ec = "SEAsia_MidEast"  /// 
		if inlist(port_name`i'_ecregion,"MidE","Asia")
	replace port_name`i'_ec = port_name`i'_eccountry  /// 
		if inlist(port_name`i'_eccountry,"China South","China Central","China North","China Taiwan","Hong Kong","Japan","South Korea","Russia, EC")
	replace port_name`i'_ec = port_name`i'_eccountry  /// 
		if inlist(port_name`i'_eccountry,"Canada, WC","Sea")  // grouping canadian, WC ports 
	replace port_name`i'_ec = port_name`i'_ecregion  /// 
		if inlist(port_name`i'_ecregion,"Oce")  // grouping oceania
	* group Taiwan in with China Central and Hong Kong into China South
	replace port_name`i'_ec = "China South"  /// 
		if inlist(port_name`i'_eccountry,"Hong Kong")  // grouping oceania	
	replace port_name`i'_ec = "China Central"  /// 
		if inlist(port_name`i'_eccountry,"China Taiwan")  // grouping oceania	
}



* owners/operators
*** groups are somewhat broader than owner/operator variables
egen owner_group_i = group(owner_group) , label
egen operator_group_i = group(operator_group) , label

* create route names with entrance/clearance routes
	* use boundary or port to port route unless it is entrance exit
	* if no EC route but we know orig/dest is HI from interpolation use that

* grouping Hawaiian routes from interpolation with entrance/clearance
* creating variable with Hawaiian ports grouped (like E/C data)
* using the route name of the track if it was interpolated
gen route_name_interp_hi = route_name_interp
replace route_name_interp_hi = subinstr(route_name_interp_hi,"Hono","HI",.)
replace route_name_interp_hi = subinstr(route_name_interp_hi,"Bpoint","HI",.)
replace route_name_interp_hi = subinstr(route_name_interp_hi,"Hilo","HI",.)
replace route_name_interp_hi = subinstr(route_name_interp_hi,"Kahalui","HI",.)
replace route_name_interp_hi = subinstr(route_name_interp_hi,"Nawil","HI",.)
replace route_name_interp_hi = subinstr(route_name_interp_hi,"PHarbor","HI",.)	
gen hi_interp = strpos(route_name_interp_hi,"HI") // flag all tracks that are interpolated to HI
	
local ec_tags _ec _ecport _eccountry _ecregion
foreach tag of local ec_tags {
	rowsort port_name1`tag' port_name2`tag' , gen(svar1 svar2)
	gen route`tag'_name = subinstr( subinstr(svar1 + "_" + svar2 ," " , "" , . ) , "," ,"" , . ) if svar1~="" & svar2~=""
	replace route`tag'_name = route_name if enterexit==0 // adding route names if not an enter/exit
	replace route`tag'_name = route_name_interp_hi if route`tag'_name=="" & hi_interp==1 // adding route names if not an enter/exit
	egen route`tag' = group(route`tag'_name) , label
	drop svar*
}
drop route_name_interp_hi
drop hi_interp


* E/C south definition
gen south_ec = (port_name1_ec=="South") | (port_name2_ec=="South")
gen japan_ec = (port_name1_ec=="Japan") | (port_name2_ec=="Japan")
gen skorea_ec = (port_name1_ec=="South Korea") | (port_name2_ec=="South Korea")
gen china_ec = strpos(port_name1_ec, "China") | strpos(port_name2_ec, "China")
gen oce_ec = strpos(port_name1_ec, "Oce") | strpos(port_name2_ec, "Oce")


* flagging distance outliers by route and vessel type
* top 1%
egen dist_pctile = xtile(dist) , by(route_id vesseltype) p(99)
gen dist_outlier = (dist_pctile~=1) 

* using entrance/clearance routes
egen dist_pctile_ec = xtile(dist) , by(route_ec vesseltype) p(99)
gen dist_outlier_ec = (dist_pctile_ec~=1) 

